Generalities

Import required libraries

if (!require("ggplot2")) install.packages("ggplot2")
if (!require("stringr")) install.packages("stringr")
if (!require("ggrepel")) install.packages("ggrepel")
if (!require("dplyr")) install.packages("dplyr")
if (!require("patchwork")) install.packages("patchwork")
if (!require("BiocManager")) install.packages("BiocManager")
if (!require("ggbreak")) install.packages("ggbreak")

if (!require("clusterProfiler")) BiocManager::install("clusterProfiler")
if (!require("AnnotationDbi")) BiocManager::install("AnnotationDbi")
if (!require("org.Hs.eg.db")) BiocManager::install("org.Hs.eg.db")
if (!require("org.Hs.eg.db")) BiocManager::install("DESeq2")

library(ggplot2)
library(stringr)
library(ggrepel)
library(dplyr)
library(patchwork)
library(clusterProfiler)
library(AnnotationDbi)
library(org.Hs.eg.db)
library(DESeq2)
library(plotly)
library(ggbreak) 

Import dataset with gene counts

genes_list <- read.delim(file.path(project_dir, "GSE107593_raw_reads_BCHRNAseq.txt"), check.names=F)

Raw counts matrix

genes_list[,c(3,10:ncol(genes_list))]

Obtaining metadata information

Patient IDs

# Read GSE107593_series_matrix.txt
metadata_matrix <- read.delim(file.path(project_dir, "Dataset copy, untouched", "GSE107593_series_matrix.txt"), check.names=F, skip = 25)
metadata_matrix_2 <- data.frame(t(metadata_matrix[,2:ncol(metadata_matrix)]))
colnames(metadata_matrix_2) <- metadata_matrix[,1]

# Obtain patient ids
patient_ids <- unique(str_replace(metadata_matrix_2[,12], "subject: ",""))

for (el in patient_ids){
  cat(paste0(el,"\n"))
}
157
877
1057
1077
1192
1214
8854
8855
8874
8878
8879
8881

Area from which samples were obtained

# Obtain sampling locations
location <- unique(str_replace(metadata_matrix_2[,11], "location: ",""))

for (el in sort(location)){
  cat(paste0(el,"\n"))
}
Ascending
Descending
Large
Rectum
Sigmoid
Tranverse

Principal component analysis (PCA)

PCA Pre-normalization

Scree plot

# Prepare data for PCA
genes_list_for_pca <- t(genes_list[,c(10:ncol(genes_list))])
colnames(genes_list_for_pca) <- genes_list[,3]

# PCA
pca <- prcomp(genes_list_for_pca, scale=F)

# Screeplot
pca_metrics <- as.data.frame(t(summary(pca)$importance))
colnames(pca_metrics) <- c("Standard_deviation", "Proportion_of_Variance", "Cumulative_Proportion")
pca_metrics$PCs <- as.numeric(substr(rownames(pca_metrics), 3, length(rownames(pca_metrics))))

ggplot(data = pca_metrics[1:10,], aes(x=PCs, y=Proportion_of_Variance)) +
  geom_line(color="blue") +
  geom_point() +
  scale_x_continuous(breaks = c(1:10)) +
  labs(title="PCA - Variance explained per principal component")+
  xlab("Principal component")+
  ylab("Proportion of variance explained")+
  theme_light()+
  theme(plot.title = element_text(size=12),
        axis.title.x = element_text(size = 11),
        axis.title.y = element_text(size = 11),
        text = element_text(size=11),
        aspect.ratio = 1/1.5)

# Create dataframe to contain relevant metadata in appropriate format
df_samples_treatments <- data.frame(row.names = rownames(metadata_matrix_2))

# Get sample name and inflammation status
df_samples_treatments$sample_inflamation <- ""
for (row in (1:nrow(metadata_matrix_2))){
  df_samples_treatments[row, ncol(df_samples_treatments)] <- str_split(
    rownames(df_samples_treatments)[row],
    "Ulcerative colitis Colon Biopsy ",
    simplify = T)[2]}

# Get sample name
df_samples_treatments$sample <- ""
for (row in (1:nrow(df_samples_treatments))){
  df_samples_treatments[row, ncol(df_samples_treatments)] <- gsub(" [^ ]*$", "", df_samples_treatments[row, ncol(df_samples_treatments)-1])
}

# Get inflammation status
df_samples_treatments$inflammation <- ""
for (row in (1:nrow(df_samples_treatments))){
  df_samples_treatments[row, ncol(df_samples_treatments)] <- gsub(".* ", "", df_samples_treatments[row, ncol(df_samples_treatments)-2])
}

# Get sample location
df_samples_treatments$location <- ""
for (row in (1:nrow(df_samples_treatments))){
  df_samples_treatments[row, "location"] <- gsub("location: ", "", metadata_matrix_2[row, 11])
}

# Get sampling hospital
df_samples_treatments$hospital <- ""
for (row in (1:nrow(df_samples_treatments))){
  df_samples_treatments[row, "hospital"] <- gsub("site: ", "", metadata_matrix_2[row, 10])
}

# Create dataframe out of the pca scores
pca_scores <- data.frame(pca$x)[1:10]

# Get the patient id in the pca scores dataframe
pca_scores$patient <- ""

for (row in (1:nrow(pca_scores))){
  for (id_patient in 1:length(patient_ids)){
    if(!is.na(str_extract(rownames(pca_scores[row,]), patient_ids[id_patient]))){
      pca_scores[row, ncol(pca_scores)] <- str_extract(rownames(pca_scores[row,]), patient_ids[id_patient])
    }
  }
}

# Get the sample id in the PCA scores dataframe
pca_scores$sample <- rownames(pca_scores)

# Get the inflammation status of each sample
pca_scores$inflammation <- ""
for(row_scores in 1:nrow(pca_scores)){
  for(row_df in 1:nrow(df_samples_treatments)){
    if(pca_scores[row_scores, "sample"] == df_samples_treatments[row_df, "sample"]){
      pca_scores[row_scores, "inflammation"] <- df_samples_treatments[row_df, "inflammation"]
    }
  }
}

# Get the sample location in the PCA scores dataframe
pca_scores$location <- ""
for(row_scores in 1:nrow(pca_scores)){
  for(row_df in 1:nrow(df_samples_treatments)){
    if(pca_scores[row_scores, "sample"] == df_samples_treatments[row_df, "sample"]){
      pca_scores[row_scores, "location"] <- df_samples_treatments[row_df, "location"]
    }
  }
}

# Get the sampling hospital in the PCA scores dataframe
pca_scores$hospital <- ""
for(row_scores in 1:nrow(pca_scores)){
  for(row_df in 1:nrow(df_samples_treatments)){
    if(pca_scores[row_scores, "sample"] == df_samples_treatments[row_df, "sample"]){
      pca_scores[row_scores, "hospital"] <- df_samples_treatments[row_df, "hospital"]
    }
  }
}

PCA Scores

p1 <- ggplot(pca_scores, aes(x=PC1, y=PC2, colour = inflammation)) +
  geom_point(size=2)+
  labs(title = "Inflammation status")+
  theme_light()+
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5),
        aspect.ratio = 1)+
  guides(colour=guide_legend(ncol=1))

p2 <- ggplot(pca_scores, aes(x=PC1, y=PC2, colour = location)) +
  geom_point(size=2)+
  labs(title = "Sample location")+
  theme_light()+
theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5),
        aspect.ratio = 1)+
  guides(colour=guide_legend(ncol=2))

p1+p2+plot_annotation(title = "PCA scores according to sample metadata - inflammation status and sample location",
                      subtitle = "Counts not normalized",
                      theme = theme(plot.title = element_text(hjust = 0.5, vjust = 3),
                                    plot.subtitle = element_text(hjust = 0.5, vjust = 3)))

Lognormalization of counts

# Create df for normalization:
genes_list_lognorm <- data.frame(matrix(NA, nrow = nrow(genes_list), ncol = ncol(genes_list)))

# Columns before counts
for (col in 1:9){
  genes_list_lognorm[[col]] <- genes_list[[col]]
}

# Column names
colnames(genes_list_lognorm) <- colnames(genes_list)

for (i in 10:ncol(genes_list)) {
  genes_list_lognorm[, i] <- log2((genes_list[, i] / colSums(genes_list[i]) * 1000000)+8)
}

genes_list_lognorm[,c(3,10:ncol(genes_list))]

PCA post lognormalization

Scree plot

# Create dataframe for PCA of lognormalized samples
genes_list_lognorm_for_pca <- t(genes_list_lognorm[,10:ncol(genes_list_lognorm)])
colnames(genes_list_lognorm_for_pca) <- rownames(genes_list_lognorm)

# PCA of lognorm counts
pca_lognorm <- prcomp(genes_list_lognorm_for_pca)

# Screeplot
pca_lognorm_metrics <- as.data.frame(t(summary(pca_lognorm)$importance))
colnames(pca_lognorm_metrics) <- c("Standard_deviation", "Proportion_of_Variance", "Cumulative_Proportion")
pca_lognorm_metrics$PCs <- as.numeric(substr(rownames(pca_lognorm_metrics), 3, length(rownames(pca_lognorm_metrics))))

ggplot(data = pca_lognorm_metrics[1:10,], aes(x=PCs, y=Proportion_of_Variance)) +
  geom_line(color="blue") +
  geom_point() +
  scale_x_continuous(breaks = c(1:10)) +
  labs(title="PCA of lognormalized counts - Variance explained per principal component")+
  xlab("Principal component")+
  ylab("Proportion of variance explained")+
  theme_light()+
  theme(plot.title = element_text(size=12),
        axis.title.x = element_text(size = 11),
        axis.title.y = element_text(size = 11),
        text = element_text(size=11),
        aspect.ratio = 1/1.5)

# Create pca scores dataframe for lognormalized counts pca
pca_scores_lognorm <- data.frame(pca_lognorm$x[,1:10])
pca_scores_lognorm$patient <- pca_scores$patient
pca_scores_lognorm$sample <- pca_scores$sample
pca_scores_lognorm$inflammation <- pca_scores$inflammation
pca_scores_lognorm$location <- pca_scores$location
pca_scores_lognorm$hospital <- pca_scores$hospital

PCA Scores

p1 <- ggplot(pca_scores_lognorm, aes(x=PC1, y=PC2, colour = inflammation)) +
  geom_point(size=2)+
  labs(title = "Inflammation status")+
  theme_light()+
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5),
        aspect.ratio = 1)+
  guides(colour=guide_legend(ncol=1))

p2 <- ggplot(pca_scores_lognorm, aes(x=PC1, y=PC2, colour = location)) +
  geom_point(size=2)+
  labs(title = "Sample location")+
  theme_light()+
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5),
        aspect.ratio = 1)+
  guides(colour=guide_legend(ncol=2))

p1+p2+plot_annotation(title = "PCA scores according to sample metadata - inflammation status and sample location",
                      subtitle = "Counts log-normalized",
                      theme = theme(plot.title = element_text(hjust = 0.5, vjust = 3),
                                    plot.subtitle = element_text(hjust = 0.5, vjust = 3)))

ggplot(pca_scores_lognorm, aes(x=PC1, y=PC2, colour = patient, shape = inflammation, label=patient)) +
  geom_point(size=3)+
  geom_text_repel(nudge_x = 2)+
  labs(title = "PCA scores by patient ID and inflammation status")+
  theme_light()+
  theme(plot.title = element_text(hjust=0.5))

# Extract the rotations for both pcas (raw and lognormalized)
pca_rotations <- data.frame(pca_lognorm$rotation[,1:10])
pca_lognorm_rotations <- data.frame(pca_lognorm$rotation[,1:10])

# Add column with gene names to rotations dataframes
pca_rotations$gene_name <- genes_list$gene_name
pca_lognorm_rotations$gene_name <- genes_list$gene_name

GO enrichment PC1 loadings

# Show genes with higher rotation values for pca (raw counts)
top_pos_pca_lognorm <- pca_lognorm_rotations %>% arrange(desc(PC1)) %>% dplyr::select(c(PC1, gene_name))
top20_pos_pca_lognorm <- top_pos_pca_lognorm[1:20,]
top20_pos_pca_lognorm$gene_name
 [1] "IGHG1"    "DUOX2"    "IGHG3"    "PI3"      "DUOXA2"   "IGHG2"   
 [7] "LCN2"     "CHI3L1"   "CXCL1"    "IGHG4"    "NOS2"     "REG1A"   
[13] "SLC6A14"  "MMP3"     "C3"       "DMBT1"    "IGHV4-34" "REG4"    
[19] "PLA2G2A"  "IGFBP5"  
# DO 
GO_PC1_pos <- enrichGO(gene = top20_pos_pca_lognorm$gene_name, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")

temp_GO_PC1_pos <- as.data.frame(GO_PC1_pos)[,c("ID", "Description", "GeneRatio", "p.adjust")]
temp_GO_PC1_pos$Description <- strtrim(temp_GO_PC1_pos$Description, 50)
rownames(temp_GO_PC1_pos) <- 1:nrow(temp_GO_PC1_pos)
temp_GO_PC1_pos
# Show genes with more negative rotation values for pca (raw counts)
top_neg_pca_lognorm <- pca_lognorm_rotations %>% arrange(PC1) %>% dplyr::select(c(PC1, gene_name))
top20_neg_pca_lognorm <- top_neg_pca_lognorm[1:20,]
top20_neg_pca_lognorm$gene_name
 [1] "AQP8"     "HMGCS2"   "SLC26A2"  "CA1"      "ANPEP"    "PCK1"    
 [7] "CHP2"     "B4GALNT2" "GUCA2A"   "ADH1C"    "PADI2"    "SLC26A3" 
[13] "ABCG2"    "FABP1"    "CA2"      "LYPD8"    "CKB"      "SLC51A"  
[19] "UGT2A3"   "SLC9A3"  
GO_PC1_neg <- enrichGO(gene = top20_neg_pca_lognorm$gene_name, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")

temp_GO_PC1_neg <- as.data.frame(GO_PC1_neg)[,c("ID", "Description", "GeneRatio", "p.adjust")]
rownames(temp_GO_PC1_neg) <- 1:nrow(temp_GO_PC1_neg)
temp_GO_PC1_neg

Differential expression analysis

cts <- as.matrix(genes_list[,c(10:ncol(genes_list))])
rownames(cts) <- genes_list[[3]]

coldata <- pca_scores[,c(11, 13:15)]
coldata$inflammation <- factor(coldata$inflammation)
coldata$location <- factor(coldata$location)
coldata$hospital <- factor(coldata$hospital)

cts <- cts[, rownames(coldata)]

as.data.frame(cts)
coldata
dds <- suppressMessages(DESeqDataSetFromMatrix(countData = cts,
                              colData = coldata,
                              design = ~ inflammation))
smallestGroupSize <- 3
keep <- rowSums(counts(dds) >= 10) >= smallestGroupSize
dds <- dds[keep,]
dds$inflamation <- relevel(dds$inflammation, ref = "Non-Inflamed")
dds <- suppressMessages(DESeq(dds))
res <- results(dds)
gene_list <- as.data.frame(res)
gene_list$Gene <- rownames(gene_list)
gene_list %>% filter(padj<0.05) %>% arrange(padj)
gene_list$significance <- NA
for (gene in 1:nrow(gene_list)){
  if (gene_list[gene,"log2FoldChange"]>=1 & gene_list[gene,"padj"]< 0.05){
    gene_list[gene, "significance"] <- "Up"
  }  
  
  else if (gene_list[gene,"log2FoldChange"]<=(-1) & gene_list[gene,"padj"]< 0.05){
    gene_list[gene, "significance"] <- "Down"
  } 
  
  else if (abs(gene_list[gene,"log2FoldChange"])<1 | gene_list[gene,"padj"]>= 0.05){
    gene_list[gene, "significance"] <- "No sig"
  }
}

gene_list_sig_down_outlier <- gene_list %>% filter(log2FoldChange < (-20))
p1 <- ggplot(gene_list_sig_down_outlier, aes(x=log2FoldChange, y=-log10(padj), label=Gene)) + 
  geom_point(color="#6696ea") + 
  xlim(-31, -19) + 
  scale_x_continuous(breaks=c(-30, -25, -20), expand = expansion(mult = c(0.01, 0.01))) + 
  ylim(0,62) +
  theme_light() +
  annotate("text", x = -30, y = -5, label = "-30") + 
  annotate("text", x = -20, y = -5, label = "-20") +
  theme(panel.border = element_blank(),
        axis.title.x = element_blank())+
  geom_vline(xintercept = -30)+
  geom_hline(yintercept=0)+
  geom_hline(yintercept=62)
Scale for x is already present.
Adding another scale for x, which will replace the existing scale.
p2 <- ggplot(gene_list%>%filter(log2FoldChange>(-20)), aes(x=log2FoldChange, y=-log10(padj), label=Gene, color=significance))+
  geom_point() +
  xlim(-10, 10)+
  scale_x_continuous(breaks=seq(-10, 10, 5), expand = expansion(mult = c(0.01, 0.01))) + 
  ylim(0,62)+
  theme_light()+
  theme(axis.title.y = element_blank(), 
        axis.text.y = element_blank(),
        panel.border = element_blank())+
  geom_vline(xintercept = 10)+
  geom_hline(yintercept=0)+
  geom_hline(yintercept=62)+
  scale_color_manual(values = c("Up"="#b02428",
                                "No sig"="grey",
                                "Down"="#6697ea"))+
  labs(color = "Significance")
Scale for x is already present.
Adding another scale for x, which will replace the existing scale.
p1plotly <- ggplotly(p1)
p2plotly <- ggplotly(p2)

subplot(p1plotly, p2plotly, widths = c(0.20, 0.80)) %>% 
  layout(title = list(text = "DE analysis Not-Inflamed vs Inflamed samples", x=0.15),
         legend = list(y = 0.5, tracegroupgap = 0.5))